In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
In [2]:
# Load the Titanic training set from the local data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer='./data/titanic-train.csv')
In [3]:
# Show which type pd.read_csv returned
type(df)
Out[3]:
In [4]:
# Peek at the first five rows (five is also head()'s default)
df.head(n=5)
Out[4]:
In [5]:
# Print a concise summary of the DataFrame: index range, column names,
# non-null counts per column, dtypes and memory usage
df.info()
In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() summarises the numeric columns only.
df.describe()
Out[6]:
In [7]:
# Select a single row by integer position (position 3 is the fourth row)
df.iloc[3]
Out[7]:
In [8]:
# Label-based selection: rows labelled 0 through 4 (both ends included) of column 'Ticket'
df.loc[0:4, 'Ticket']
Out[8]:
In [9]:
# A single column is returned as a Series; show its first five entries
df['Ticket'].head(n=5)
Out[9]:
In [10]:
# Selecting with a list of column names returns a DataFrame with just those columns
df.loc[:, ['Embarked', 'Ticket']].head()
Out[10]:
In [11]:
# Boolean-mask selection: keep only the rows whose 'Age' is greater than 70
df.loc[df['Age'] > 70]
Out[11]:
In [12]:
# The mask on its own: one True/False per row, True where 'Age' exceeds 70
df['Age'].gt(70)
Out[12]:
In [13]:
# Same filter written as a query expression
df.query('Age > 70')
Out[13]:
In [14]:
# Rows that satisfy both conditions (element-wise AND of two masks)
df[df['Age'].eq(11) & df['SibSp'].eq(5)]
Out[14]:
In [15]:
# Rows that satisfy at least one of the conditions (element-wise OR of two masks)
df[df['Age'].eq(11) | df['SibSp'].eq(5)]
Out[15]:
In [16]:
# The same OR filter expressed through query()
df.query('Age == 11 or SibSp == 5')
Out[16]:
In [17]:
# Distinct values found in 'Embarked' (a missing value, if present, shows up as nan)
df['Embarked'].unique()
Out[17]:
In [18]:
# Five rows with the largest 'Age' values (sorted in descending order)
df.sort_values(by='Age', ascending=False).head()
Out[18]:
In [19]:
# Five rows with the smallest 'Age' values (ascending is the default sort order)
df.sort_values(by='Age').head()
Out[19]:
In [20]:
# Number of rows for each distinct value of 'Survived', most frequent value first
df['Survived'].value_counts()
Out[20]:
In [21]:
# Number of rows for each distinct value of 'Pclass', most frequent value first
df['Pclass'].value_counts()
Out[21]:
In [22]:
# Count of non-null 'PassengerId' values for every (Pclass, Survived) combination
(
    df.groupby(['Pclass', 'Survived'])['PassengerId']
      .count()
)
Out[22]:
In [23]:
# Smallest value in 'Age' (missing values are skipped by default)
df['Age'].min()
Out[23]:
In [24]:
# Largest value in 'Age' (missing values are skipped by default)
df['Age'].max()
Out[24]:
In [25]:
# Arithmetic mean of 'Age' (missing values are skipped by default)
df['Age'].mean()
Out[25]:
In [26]:
# Median of 'Age' (missing values are skipped by default)
df['Age'].median()
Out[26]:
In [27]:
# Mean of 'Age' within each 'Survived' group
mean_age_by_survived = df.groupby(by='Survived')['Age'].mean()
mean_age_by_survived
Out[27]:
In [28]:
# Standard deviation of 'Age' within each 'Survived' group
std_age_by_survived = df.groupby(by='Survived')['Age'].std()
std_age_by_survived
Out[28]:
In [29]:
# Round both summaries to whole numbers and turn the 'Survived' index
# back into an ordinary column so the two results can be merged on it
df1 = mean_age_by_survived.round(decimals=0).reset_index()
df2 = std_age_by_survived.round(decimals=0).reset_index()
In [30]:
# Rounded mean age per 'Survived' value, as a two-column DataFrame
df1
Out[30]:
In [31]:
# Rounded age standard deviation per 'Survived' value, as a two-column DataFrame
df2
Out[31]:
In [32]:
# Join the two summaries on their shared 'Survived' column
df3 = df1.merge(df2, on='Survived')
In [33]:
# Merged result; both inputs had an 'Age' column, so merge disambiguates them with suffixes
df3
Out[33]:
In [34]:
# Replace the column labels of the merged frame with descriptive names
# (order matches the merge: key, mean column, standard-deviation column)
df3.columns = ['Survived', 'Average Age', 'Age Standard Deviation']
In [35]:
# Same frame, now showing the descriptive column names
df3
Out[35]:
In [36]:
# Pivot table of passenger counts: one row per 'Pclass', one column per 'Survived' value
df.pivot_table(values='PassengerId',
               index='Pclass',
               columns='Survived',
               aggfunc='count')
Out[36]:
In [37]:
# Add a boolean column that is True where 'Sex' equals 'female'
df['IsFemale'] = df['Sex'].eq('female')
In [38]:
# First rows of the newly added boolean column
df['IsFemale'].head()
Out[38]:
In [39]:
# Correlation of every numeric/boolean column with 'Survived', sorted ascending.
# Text columns (e.g. 'Sex') are filtered out explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises on non-numeric data.
# select_dtypes keeps the cell working on older pandas releases as well.
correlated_with_survived = (
    df.select_dtypes(include=['number', 'bool'])
      .corr()['Survived']
      .sort_values()
)
correlated_with_survived
Out[39]:
In [40]:
%matplotlib inline
In [41]:
# Bar chart of each column's correlation with 'Survived'.
# The series is sorted ascending, so its last entry is 'Survived' itself
# (self-correlation of 1); the [:-1] slice leaves that entry out.
correlated_with_survived.iloc[:-1].plot.bar(
    title='Titanic Passengers: correlation with survival')
Out[41]:
In [42]:
# Full pairwise correlation matrix of the numeric/boolean columns.
# Text columns are excluded first because DataFrame.corr() raises on
# non-numeric data from pandas 2.0 onwards.
df.select_dtypes(include=['number', 'bool']).corr()
Out[42]:
In [43]:
# Four synthetic series of 1000 points each, used by the plotting examples.
# The generator is seeded so that "Restart & Run All" reproduces the same figures.
np.random.seed(42)
data1 = np.random.normal(0, 0.1, 1000)                                            # narrow Gaussian noise around 0
data2 = np.random.normal(1, 0.4, 1000) + np.linspace(0, 1, 1000)                  # Gaussian noise plus a linear ramp
data3 = 2 + np.random.random(1000) * np.linspace(1, 5, 1000)                      # uniform noise whose amplitude grows
data4 = np.random.normal(3, 0.2, 1000) + 0.3 * np.sin(np.linspace(0, 20, 1000))   # Gaussian noise plus a sine wave
In [44]:
# Arrange the four 1-D series side by side as the columns of a (1000, 4) array
data = np.column_stack([data1, data2, data3, data4])
In [45]:
# Wrap the stacked array in a DataFrame with one named column per series.
# Note: this rebinds `df`, which held the Titanic data up to this point.
df = pd.DataFrame(data=data,
                  columns=['data1', 'data2', 'data3', 'data4'])
df.head()
Out[45]:
In [46]:
# Line plot of every column against the row index, drawn by pandas
df.plot(figsize=(7, 7), title='Line plot')
Out[46]:
In [47]:
# The same line plot drawn through the pyplot interface directly;
# here the title and the legend entries have to be supplied by hand
plt.plot(df)
plt.title('Line plot')
plt.legend(['data1', 'data2', 'data3', 'data4'])
Out[47]:
In [48]:
# Same data drawn with point markers instead of connected lines
df.plot(figsize=(7, 7), style='.')
Out[48]:
In [49]:
# Scatter plot of 'data2' against 'data1' with fixed axis limits
df.plot.scatter(x='data1',
                y='data2',
                figsize=(7, 7),
                xlim=(-1.5, 1.5),
                ylim=(0, 3))
Out[49]:
In [50]:
# Overlaid histograms of all columns; alpha < 1 keeps overlapping bars visible
df.plot.hist(bins=50,
             figsize=(7, 7),
             title='Histogram',
             alpha=0.6)
Out[50]:
In [51]:
# Normalised cumulative histograms of all columns.
# `density=True` replaces the old `normed` keyword, which Matplotlib
# deprecated in 2.1 and removed in 3.1 (passing it now raises an error).
df.plot(kind='hist',
        figsize=(7, 7),
        bins=100,
        title='Cumulative distributions',
        density=True,
        cumulative=True,
        alpha=0.4)
Out[51]:
In [52]:
# One box-and-whisker per column
df.plot.box(figsize=(7, 7), title='Boxplot')
Out[52]:
In [53]:
# 2x2 grid showing the four plot types side by side; each pandas plot
# is directed to its own Axes through the `ax` argument
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
df.plot(ax=ax[0, 0], title='Line plot')
df.plot(ax=ax[0, 1], style='o', title='Scatter plot')
df.plot.hist(ax=ax[1, 0], bins=50, title='Histogram')
df.plot.box(ax=ax[1, 1], title='Boxplot')
# Adjust spacing so titles and tick labels of neighbouring panels do not overlap
plt.tight_layout()
In [54]:
# Boolean mask: True where 'data1' is greater than 0.1, then count each outcome
gt01 = df['data1'].gt(0.1)
piecounts = gt01.value_counts()
piecounts
Out[54]:
In [55]:
# Pie chart of how many 'data1' samples are above / not above 0.1.
# value_counts() orders its rows by frequency, so the order is pinned to
# [False, True] here; `labels` and `explode` are positional and would be
# attached to the wrong slice if the frequency order ever flipped.
# explode : offset of each slice from the centre, as a fraction of the radius
# autopct : format string for the percentage printed on each slice
piecounts.reindex([False, True], fill_value=0).plot(kind='pie',
                                                    figsize=(7, 7),
                                                    explode=[0.0, 0.15],
                                                    labels=['<= 0.1', '> 0.1'],
                                                    #colors = ['#191970', '#001CF0'],
                                                    autopct='%1.2f%%',
                                                    shadow=True,
                                                    startangle=90,
                                                    fontsize=16)
plt.legend(loc="best")
Out[55]:
In [56]:
# Two 2-D Gaussian blobs stacked into one (3000, 2) array:
# 1000 points centred on (0, 0) with std 2 and 2000 points centred on (9, 9) with std 3.
# The generator is seeded so that "Restart & Run All" reproduces the same figures.
# Note: this rebinds both `data` and `df`.
np.random.seed(42)
data = np.vstack([np.random.normal((0, 0), 2, size=(1000, 2)),
                  np.random.normal((9, 9), 3, size=(2000, 2))])
df = pd.DataFrame(data, columns=['x', 'y'])
In [57]:
# First five rows of the two-column point cloud
df.head()
Out[57]:
In [58]:
# Default line plot: each column drawn against the row index
df.plot()
Out[58]:
In [59]:
# Kernel density estimate of each column
df.plot.kde()
Out[59]:
In [60]:
# Hexagonal-bin density plot of 'y' against 'x'.
# NOTE(review): for hexbin, `bins` controls how the counts are discretised for
# colouring; the number of hexagons is set by `gridsize` — confirm that `bins`
# is really the argument that was intended here.
df.plot.hexbin(x='x',
               y='y',
               bins=100,
               cmap='rainbow')
Out[60]:
In [61]:
from PIL import Image
In [62]:
# Open the image file with PIL; leaving `img` as the last expression renders it inline
img = Image.open('./data/iss.jpg')
img
Out[62]:
In [63]:
# Show the concrete PIL class that Image.open returned
type(img)
Out[63]:
In [64]:
# Convert the PIL image into a NumPy array of pixel values
imgarray = np.asarray(img)
In [65]:
# Confirm the conversion produced a NumPy array
type(imgarray)
Out[65]:
In [66]:
# Array dimensions of the decoded image; presumably (rows, columns, channels)
# with 3 colour channels — confirm against the output.
# NOTE(review): a previous comment hard-coded the pixel dimensions; read them
# from the displayed shape instead of trusting a typed-in number.
imgarray.shape
Out[66]:
In [67]:
# Flatten the array to one dimension and show the resulting length
imgarray.ravel().shape
Out[67]:
In [68]:
# Expected length of the flattened array: the product of all axis lengths.
# Computed from the array itself instead of typed-in numbers, so the check
# cannot drift from the real image size.
int(np.prod(imgarray.shape))
Out[68]:
In [69]:
from scipy.io import wavfile
In [70]:
# Read the WAV file: `rate` is the sampling rate, `sound` the array of samples
rate, sound = wavfile.read('./data/sms.wav')
In [71]:
from IPython.display import Audio
In [72]:
# Embed an audio player for the clip at the sampling rate read from the file
Audio(data = sound, rate = rate)
Out[72]:
In [73]:
# Same samples, but declared at half the original sampling rate
Audio(data = sound, rate = 0.5 * rate)
Out[73]:
In [74]:
# Length of the sample array along its first axis
len(sound)
Out[74]:
In [75]:
# Raw sample values (NumPy abbreviates the repr of long arrays)
sound
Out[75]:
In [76]:
# Waveform: sample value against sample index
plt.plot(sound)
Out[76]:
In [77]:
# Spectrogram of the clip. The sampling frequency is taken from the WAV file
# (`rate`) instead of a hard-coded 44100, so the time and frequency axes stay
# correct whatever the file's actual sampling rate is.
# NOTE(review): plt.specgram expects a 1-D signal — assumes `sound` is mono; confirm.
plt.specgram(sound, NFFT=1024, Fs=rate)
plt.ylabel('Frequency (Hz)')
plt.xlabel('Time (s)')
Out[77]:
- ../data/international-airline-passengers.csv
- .info() and .head() commands
- pd.to_datetime() to change the column type of 'Month' to a datetime type
- df.set_index() method
In [78]:
# Load the airline-passengers CSV into its own DataFrame
ex1 = pd.read_csv('./data/international-airline-passengers.csv')
In [79]:
# First five rows of the airline data
ex1.head()
Out[79]:
In [80]:
# Column names, non-null counts and dtypes of the airline data
ex1.info()
In [81]:
# Convert the 'Month' column to pandas datetime values (overwrites the column in place)
ex1['Month'] = pd.to_datetime(ex1['Month'])
In [82]:
# Use 'Month' as the index so that plots get a time axis.
# Note: 'Month' stops being a column afterwards, so this cell cannot be re-run on its own.
ex1 = ex1.set_index(keys='Month')
ex1.head()
Out[82]:
In [83]:
# Line plot of the remaining column(s) against the datetime index
ex1.plot()
Out[83]:
In [84]:
# Load the weight/height CSV into its own DataFrame
ex2 = pd.read_csv('./data/weight-height.csv')
In [85]:
# First five rows of the weight/height data
ex2.head()
Out[85]:
In [86]:
# Column names, non-null counts and dtypes of the weight/height data
ex2.info()
In [87]:
# Descriptive statistics of the numeric columns
ex2.describe()
Out[87]:
In [88]:
# Scatter plot of 'Weight' against 'Height' for the whole data set
ex2.plot.scatter(x='Height', y='Weight')
Out[88]:
In [89]:
# Split the rows into two frames according to the value of 'Gender'
ex2_males = ex2[ex2['Gender'].eq('Male')]
ex2_females = ex2[ex2['Gender'].eq('Female')]
In [90]:
# Weight-vs-height scatter with both groups on the same Axes, colour-coded.
# Each group carries a legend label, and the title names both populations
# (it previously read 'Male' although both groups are drawn).
fig, ax = plt.subplots(figsize=(10, 10))
ex2_males.plot(kind='scatter',
               x='Height',
               y='Weight',
               ax=ax,
               color='blue',
               alpha=0.2,
               label='Male')
ex2_females.plot(kind='scatter',
                 x='Height',
                 y='Weight',
                 ax=ax,
                 color='red',
                 alpha=0.2,
                 label='Female')
plt.title('Male & Female Populations')
Out[90]:
In [91]:
# Overlaid height histograms for the two groups, with a dashed vertical line at each mean.
# Every artist gets an explicit label: without them both histograms inherit the
# Series name 'Height' and the legend cannot tell the groups apart.
# The plots are also bound to the Axes created here rather than to "whatever is current".
ex3_males = ex2_males
ex3_females = ex2_females
fig, ax = plt.subplots(figsize=(10, 10))
ex3_males['Height'].plot(kind='hist',
                         bins=30,
                         color='blue',
                         alpha=0.3,
                         ax=ax,
                         label='Male')
ex3_females['Height'].plot(kind='hist',
                           bins=30,
                           color='red',
                           alpha=0.3,
                           ax=ax,
                           label='Female')
ax.axvline(ex3_males['Height'].mean(),
           linewidth=3,
           linestyle='dashed',
           color='blue',
           label='Male mean')
ax.axvline(ex3_females['Height'].mean(),
           linewidth=3,
           linestyle='dashed',
           color='red',
           label='Female mean')
ax.set_title('Height distribution by gender')
ax.set_xlabel('Height')
ax.legend()
Out[91]:
In [92]:
# `ex4` is a second name for the same DataFrame object as `ex2` (no copy is made)
ex4 = ex2
ex4.head()
Out[92]:
In [93]:
# Spread 'Weight' into one column per 'Gender' value. The original row index is
# kept, so each row is filled only in its own gender's column and NaN elsewhere.
ex4_pivot = ex4.pivot(values='Weight', columns='Gender')
In [94]:
# First five rows of the pivoted frame
ex4_pivot.head()
Out[94]:
In [95]:
# Last five rows of the pivoted frame
ex4_pivot.tail()
Out[95]:
In [96]:
# Box plot of weight per gender column, with hand-picked y ticks and a grid
ex4_pivot.plot.box(figsize=(10, 10),
                   yticks=[120, 150, 200, 250])
plt.grid()
plt.show()
../data/titanic-train.csv
In [97]:
# Reload the Titanic training set under its own name (`df` was rebound to other data earlier)
ex5 = pd.read_csv('./data/titanic-train.csv')
ex5.head()
Out[97]:
In [98]:
# Remove the 'PassengerId' column before plotting.
# Note: re-running this cell without reloading `ex5` raises, because the column is already gone.
ex5 = ex5.drop(columns='PassengerId')
In [99]:
from pandas.plotting import scatter_matrix
In [100]:
# Grid of pairwise scatter plots; the return value is assigned to `_`
# so the array of Axes is not echoed as the cell output
_ = scatter_matrix(ex5, figsize = (15, 15))
In [101]:
# Kernel density estimate of the 'Age' column
ex5['Age'].plot.kde(figsize = (7, 7))
Out[101]:
In [102]:
# Kernel density estimate of the 'Fare' column
ex5['Fare'].plot.kde(figsize = (7, 7))
Out[102]:
In [103]:
# Kernel density estimate of the 'SibSp' column
ex5['SibSp'].plot.kde(figsize = (7, 7))
Out[103]: